home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Collection of Internet
/
Collection of Internet.iso
/
infosrvr
/
dev
/
scott
/
WWW
/
NextStep
/
Implementation
/
ParseHTML.h
< prev
next >
Wrap
Text File
|
1993-08-26
|
11KB
|
364 lines
/* FORMAT CONVERSION FROM SGML
** ===========================
**
**
** 22 Nov 92 Fixed quoting of hrefs.
** CERN_WEIRDO ifdefed out -- proper SGML expected
** REMOVE_SCRIPT ifdefed out -- did ignore lines starting with "."
*/
#import "HTStyle.h"
#include "HTML.h" /* For directory object building */
/* Shorthand for calling through the output target's method table.
** Each macro expands to an indirect call via the matching function
** pointer member of `targetClass`, passing `target` as first argument,
** so both of those names must be in scope wherever a macro is used.
*/
#define PUTC(c) (*targetClass.put_character)(target, c)
#define PUTS(s) (*targetClass.put_string)(target, s)
/* NOTE(review): the two trailing zeros fill the remaining start_element
** arguments -- presumably "no attributes"; confirm against the class. */
#define START(e) (*targetClass.start_element)(target, e, 0, 0)
#define END(e) (*targetClass.end_element)(target, e)
#define END_TARGET (*targetClass.end_document)(target)
#define FREE_TARGET (*targetClass.free)(target)
/* Stream object: begins with a pointer to its class (method table).
** NOTE(review): the remaining members are elided in this listing.
*/
struct _HTStream {
CONST HTStreamClass * isa;
/* ... */
};
/* Description of one SGML paragraph style: the tags and conventional
** text that delimit it on output, plus flags governing line handling.
** The style tables in this file initialise these members positionally,
** so their order must not be changed.
*/
typedef struct _SGML_style {
char * start_tag; /* Tag to mark start of a style */
char * paragraph_tag; /* Tag to mark paragraph mark within style */
char * tab_tag; /* Tag to mark tab within style */
char * end_tag; /* Tag to mark end of style */
char * start_text; /* Text conventionally starting this style */
char * paragraph_text; /* Text used as a paragraph mark within style*/
char * end_text; /* Text used to end a style */
HTStyle * style; /* Paragraph style to be used */
int free_format; /* Flag: are line ends word breaks only? */
int litteral; /* Flag: end only at close tag (cheat) ? If set, the generator writes text without escaping */
} SGML_style;
/* Stack of previous styles:
**
** Singly linked list node; `next` leads to the style that was
** interrupted before this one, or is 0 at the bottom of the stack.
** NOTE(review): no use of this type is visible in this part of the file.
*/
typedef struct _NestedStyle {
struct _NestedStyle * next; /* previously nested style or 0 */
SGML_style * SGML; /* SGML style interrupted */
} NestedStyle;
/* Paragraph Styles used by the SGML parser:
** ----------------------------------------
**
** Initialisers are positional, in SGML_style member order:
** start_tag, paragraph_tag, tab_tag, end_tag,
** start_text, paragraph_text, end_text, style, free_format, litteral.
** Every `style` pointer starts out as 0 here.
*/
/* Default style: no enclosing tags, paragraphs separated by <P>. */
static SGML_style Normal =
{ "", "<P>\n", "\t", "",
"","", "", 0 ,1, 0};
/* Heading levels 1 to 6; a paragraph break inside a heading closes the
** element and immediately reopens one of the same level. */
static SGML_style Heading[6] = {
{ "\n<H1>", "</H1>\n<H1>", "\t", "</H1>", "", "", "", 0, 1, 0},
{ "\n<H2>", "</H2>\n<H2>", "\t", "</H2>", "", "", "", 0, 1, 0},
{ "\n<H3>", "</H3>\n<H3>", "\t", "</H3>", "", "", "", 0, 1, 0},
{ "\n<H4>", "</H4>\n<H4>", "\t", "</H4>", "", "", "", 0, 1, 0},
{ "\n<H5>", "</H5>\n<H5>", "\t", "</H5>", "", "", "", 0, 1, 0},
{ "\n<H6>", "</H6>\n<H6>", "\t", "</H6>", "", "", "", 0, 1, 0}
};
/* Glossary (definition list): each paragraph opens a <DT> term and a tab
** switches to the <DD> definition.  Free format, not litteral.
*/
static SGML_style Glossary = /* Large hanging indent with tab */
{ "\n<DL>\n<DT>", "\n<DT>", "\n<DD>", "\n</DL>\n",
"", "", "", 0, 1, 0};
/* List styles.  start_text and paragraph_text are "\267\t": character
** code octal 267 followed by a tab (presumably the on-screen bullet --
** confirm against the text encoding in use).  The generator strips this
** prefix from the text when writing SGML.
*/
static SGML_style listStyle = /* Hanging indent with tab */
{ "\n<UL>\n<LI>", "\n<LI>", "\t", "\n</UL>",
"\267\t", "\267\t", "", 0, 1, 0};
static SGML_style menuStyle = /* Like UL but less gap */
{ "\n<MENU>\n<LI>", "\n<LI>", "\t", "\n</MENU>",
"\267\t", "\267\t", "", 0, 1, 0};
/* Address block: free format, paragraphs separated by <P>. */
static SGML_style addressStyle =
{ "\n<ADDRESS>", "<P>", "\t", "\n</ADDRESS>",
"", "", "", 0, 1, 0 };
/* Explicit format styles:
**
** The last two initialisers are free_format and litteral.  Example and
** Listing are litteral (text written without escaping) and not free
** format; Preformatted is neither; Fixed is free format only.
*/
static SGML_style Example = /* Fixed width font, at least 80 chars wide */
{ "\n<XMP>", "\n", "\t", "</XMP>",
"", "", "", 0 , 0, 1};
static SGML_style Preformatted = /* Fixed width font, at least 80 chars wide */
{ "\n<PRE>", "\n", "\t", "</PRE>",
"", "", "", 0 , 0, 0}; /* not litteral */
static SGML_style Fixed = /* Fixed width font, at least 80 chars wide */
{ "\n<FIXED>", "<P>", "\t", "</FIXED>",
"", "", "", 0 , 1, 0};
static SGML_style Listing = /* Fixed width font, at least 132 chars wide */
{ "\n<LISTING>", "\n", "\t", "</LISTING>",
"", "", "", 0 , 0, 1};
/* Every SGML paragraph style known to the generator, in the order in
** which findSGML() searches them.
*/
static SGML_style * styleTable[] = {
    &Normal,
    &Heading[0],
    &Heading[1],
    &Heading[2],
    &Heading[3],
    &Heading[4],
    &Heading[5],
    &Glossary,
    &listStyle,
    &menuStyle,
    &addressStyle,
    &Preformatted,
    &Fixed,
    &Example,
    &Listing
};

/* Number of entries in styleTable */
#define NUMBER_OF_STYLES (sizeof(styleTable)/sizeof(styleTable[0]))
/* Write SGML File back OUT
** ------------------------
**
** This is currently quite NeXT-specific.
**
** We run through the runs. When a characteristic of a run changes, we
** output the appropriate SGML code. When several characteristics change at
** the same place, we output the code in an order such that the resulting
** structures will be nested. This means first unwrapping the old ones, and
** then entering the new ones. For example, it is better to produce
**
** <h2><a>...</a></h2><a>...</a>
** than
**
** <h2><a>...</h2></a><a>...</a>
**
** The special treatment of newlines is because we want to strip extra newlines
** out. We ignore newlines at the beginning and end of the para style,
** and we treat multiple newlines as a single paragraph mark.
**
** Bugs: @@@ Highlighting is ignored.
** @@@ end text is ignored.
*/
/* State shared between writeSGML:relativeTo: and changeRunFrom:to:
** while a document is being written out.
*/
#define LINE_WRAP 64 /* Start thinking about line wrap here */
static int SGML_gen_newlines; /* Number of newlines pending during SGML generation */
static int SGML_gen_errors; /* Number of unrecognizable runs */
static SGML_style * currentSGML; /* Style of the paragraph being written, or 0 */
static const char * saveName; /* pointer to name node is being saved under */
static char * prefix; /* Pointer to prefix string to be junked */
static int lineLength; /* Number of characters on a line so far */
/* Map a paragraph style onto the SGML style that represents it.
**
** On entry,
**	para	paragraph style pointer taken from a run, or 0
** On exit,
**	returns	the styleTable entry whose style->paragraph equals para.
**		A null para, or one matching no entry, yields &Normal;
**		the no-match case is also counted in SGML_gen_errors.
*/
SGML_style * findSGML(void *para)
{
    int n;

    if (para) {
        for (n = 0; n < NUMBER_OF_STYLES; n++) {
            SGML_style * candidate = styleTable[n];

            /* Skip empty slots and entries with no style attached */
            if (!candidate || !candidate->style) continue;
            if (candidate->style->paragraph == para) return candidate;
        }
        if (TRACE) printf("HT: Can't find SGML style!\n");
        SGML_gen_errors++;
    }
    return &Normal; /* Totally unstyled, or unknown, becomes Normal */
}
/* Change Run
** ==========
**
** Generate the SGML for run `r`, given the run `last` that preceded it.
**
** Markup that ended with `last` is closed first (anchor, then paragraph
** style) and markup for `r` is opened afterwards (paragraph style, then
** anchor), so that the generated elements nest properly.  The text of
** the run is then read with NEXT_TEXT_CHAR and written to the target.
**
** On entry,
**	last	the previous run, or a dummy run with null info/paraStyle
**	r	the run to generate; r->chars characters are consumed
**
** Reads and updates the file-level generator state: currentSGML, prefix,
** SGML_gen_newlines and lineLength.
*/
- (void) changeRunFrom: (NXRun *) last to: (NXRun *) r
{
    int chars_left = r->chars;
    int anchor_changed = (r->info != last->info);
    int style_changed = (r->paraStyle != last->paraStyle);

    if (anchor_changed && last->info) PUTS("</A>");	/* End anchor */

    if (style_changed && last->paraStyle) {		/* End paragraph */
        if (currentSGML) PUTS(currentSGML->end_tag);
        else PUTS("<P>\n");
        lineLength = 0;				/* At column 1 */
    }

/* Start paragraph.  This is also done when no style has been selected
** yet (e.g. the first run is unstyled, so it compares equal to the dummy
** run): everything below dereferences currentSGML.  findSGML() maps a
** null paragraph style to Normal and never returns null.
*/
    if (style_changed || !currentSGML) {
        currentSGML = findSGML(r->paraStyle);
        if (currentSGML->free_format) {
            while (chars_left && WHITE(*read_pointer)) {	/* Strip leading */
                chars_left--;				/* white space */
                (void) NEXT_TEXT_CHAR;
            }
        }
        PUTS(currentSGML->start_tag);
        prefix = currentSGML->start_text;
        SGML_gen_newlines = 0;			/* Cancel */
    }

    if (anchor_changed) {				/* Start anchor */
        if (SGML_gen_newlines) {	/* Got anchor, need paragraph separator */
            PUTS(currentSGML->paragraph_tag);
            SGML_gen_newlines = 0;		/* paragraph flushed. */
        }
        if (r->info) {
            HTChildAnchor * a = (HTChildAnchor *) r->info;
            HTAnchor * d = HTAnchor_followMainLink((HTAnchor*)a);
            char * address = HTAnchor_address((HTAnchor*)a);
            /* The anchor name is whatever follows the last '#'.  An
            ** address without one gets no NAME attribute rather than
            ** a read through a bad pointer. */
            char * fragment = address ? strrchr(address, '#') : 0;

            PUTS("<A");
            if (fragment) {
                PUTS("\nNAME=");
                PUTS(fragment+1);
            }
            free(address);
            if (d) {
                char * absolute = HTAnchor_address(d);
                char * relative = HTRelative(absolute, saveName);
                PUTS(" HREF=\"");
                PUTS(relative);
                PUTC('"');
                free(relative);
                free(absolute);
            }
            PUTC('>');
        }
    }

/* Now output the textual part of the run
**
** Within the prefix region (prefix!=0), we discard white space and
** characters matching *prefix++. Note the prefix string may contain
** white space.
**
** The SGML_gen_newlines flag means that newlines have been found. They
** are not actually implemented unless some more non-white text is found,
** so that trailing newlines on the end of paragraphs are stripped.
**
** The line wrapping is primitive in the extreme, as only text characters
** are counted. In practice it limits the length of any line to a
** reasonable amount, though this is not guaranteed.
*/
    while (chars_left) {
        char c = NEXT_TEXT_CHAR;
        chars_left--;

        if (prefix) {
            if (*prefix) {
                if (c == *prefix) {
                    ++prefix;
                    continue;		/* Strip prefix characters */
                }
                if (WHITE(c)) continue;	/* Strip white space */
                if (TRACE) printf(
                    "HTML: WARNING: Paragraph prefix incomplete: %i found where %i expected.\n",
                    c, *prefix);
            }
            prefix = 0;			/* Prefix is over */
        }

        if (c == '\n') {			/* Paragraph Marks: */
            if (currentSGML->free_format) {
                SGML_gen_newlines++;	/* Just flag it */
                prefix = currentSGML->paragraph_text;
            } else {
                PUTS(currentSGML->paragraph_tag);
            }
            lineLength = 0;		/* At column 1 */
            continue;
        }

        if (SGML_gen_newlines) {	/* Got text, need paragraph separator */
            PUTS(currentSGML->paragraph_tag);
            SGML_gen_newlines = 0;	/* paragraph flushed. */
            lineLength = 0;		/* At column 1 */
        }

        if (c == '\t') {
            PUTS(currentSGML->tab_tag);
            continue;
        }

        /* Ordinary character.  Count it exactly once, so that LINE_WRAP
        ** is measured in characters.  @@bug doesn't count entity names */
        lineLength++;
        if (currentSGML->free_format
                && lineLength > LINE_WRAP	/* Wrap lines if we can */
                && c == ' ') {
            c = '\n';
            lineLength = 0;
        }

        if (currentSGML->litteral) {
            PUTC(c);
        } else {
            switch (c) {		/* Markup characters become entities */
            case '<': PUTS("&lt;"); break;
            case '&': PUTS("&amp;"); break;
            default: PUTC(c); break;
            }
        }
    }
} /* changeRunFrom:to: */
/* This is the body of the SGML output method.
**
** Writes the whole text out through `target` as an HTML document: a
** HEAD holding the window title (and a NEXTID element when
** nextAnchorNumber is non-zero), then a BODY produced by walking the
** run array with changeRunFrom:to:.
**
** On entry,
**	target	structured output object; its class record is copied so
**		that the PUTC/PUTS/START/END macros can call through it
**	aName	name the node is being saved under; anchor HREFs are
**		written relative to it
** On exit,
**	returns	self, or nil if any run had a paragraph style for which
**		no SGML style was found (SGML_gen_errors non-zero)
**
** NOTE(review): END_TARGET and FREE_TARGET are not invoked here; confirm
** that the caller finishes and frees the target.
*/
- writeSGML:(HTStructured *) target relativeTo:(const char *)aName
{
    NXRun * r = theRuns->runs;
    int sor;			/* Character position of start of run */
    NXRun dummy;		/* Stands for "no run" before and after the text */
    char buffer[64];
    HTStructuredClass targetClass = *target->isa;	/* copy access routines */

    dummy.paraStyle = 0;
    dummy.info = 0;
    dummy.chars = 0;

    SGML_gen_newlines = 0;	/* Number of newlines read but not inserted */
    HT = self;
    saveName = aName;
    SGML_gen_errors = 0;
    currentSGML = 0;
    prefix = 0;			/* No prefix to junk */
    START_INPUT;
    lineLength = 0;		/* Starting in column 1 */

    START(HTML_HTML);
    START(HTML_HEAD);
    START(HTML_TITLE);
    PUTS([window title]);
    END(HTML_TITLE);
    if (nextAnchorNumber) {
        sprintf(buffer, "\n<NEXTID N=\"z%i\">\n", nextAnchorNumber);
        PUTS(buffer);
    }
    END(HTML_HEAD);
    START(HTML_BODY);

/* Change style tags etc
*/
    [self changeRunFrom:&dummy to:r];		/* Start first run */
    for (sor=r++->chars; sor<textLength; sor=sor+(r++)->chars) {
        if (TRACE) printf("%4i: %i chars in run %3i.\n",
                sor, r->chars, (int)(r-theRuns->runs));
        [self changeRunFrom:r-1 to: r];		/* Runs 2 to N */
    }
    /* The loop leaves r one element past the last run it handled, so
    ** the run to be closed is r-1. */
    [self changeRunFrom:r-1 to:&dummy];		/* Close last run */

    tFlags.changeState = 0;	/* Please notify delegate if changed */
    END(HTML_BODY);
    END(HTML_HTML);
    return (SGML_gen_errors) ? nil : self;
}